Reading ENIGH microdata (CSV files)

Check Energy Expenses

Check all the energy expenses in the households expenses data set and people expenses data sets are the same than the ones reported in the households condensed data set.

df_hsd_condens$energia is equal to the sum of gastoshogar.gasto_tri if clave is equal to G009-G016, R001, R003 plus gastospersona.gasto_tri if clave is equal to G009-G016, R001, R003.

Code (clave) Description
G009 Liquefied petroleum gas
G010 Petroleum
G011 Diesel
G012 Carbon
G013 Firewood
G014 Fuel to heat
G015 Candles
G016 Other fuels
R001 Electricity
R003 Natural Gas
#df_person_energy
# Create household id, this will be the key attribute in the SQL table
df_hsd_energy$id_household <- paste0(df_hsd_energy$folioviv, df_hsd_energy$foliohog)
df_person_energy$id_household <- paste0(df_person_energy$folioviv, df_person_energy$foliohog)

Grouping data sets by household id and expense id (clave)

df_hsd_energy_by_hsd <- df_hsd_energy %>%
  group_by(id_household, clave) %>% replace(is.na(.), 0) %>% 
  summarise(gasto_tri = sum(gasto_tri))

df_psn_energy_by_hsd <- df_person_energy %>%
  group_by(id_household, clave) %>% replace(is.na(.), 0) %>% 
  summarise(gasto_tri = sum(gasto_tri))

head(df_hsd_energy_by_hsd)
head(df_psn_energy_by_hsd)

Merging data sets by id_household using energy expenses related data

df_energy_by_hsd <- merge(df_hsd_energy_by_hsd, df_psn_energy_by_hsd, 
                          by="id_household", all.y = TRUE, all.x = TRUE) %>% 
  replace(is.na(.), 0)

Reshaping data

df_hsd_energy_by_hsd <- as.data.frame(df_hsd_energy_by_hsd)

df_hsd_energy_by_hsd$clave[df_hsd_energy_by_hsd$clave=='G009'] <- 'LPG'
df_hsd_energy_by_hsd$clave[df_hsd_energy_by_hsd$clave=='G010'] <- 'Petroleum'
df_hsd_energy_by_hsd$clave[df_hsd_energy_by_hsd$clave=='G011'] <- 'Diesel'
df_hsd_energy_by_hsd$clave[df_hsd_energy_by_hsd$clave=='G012'] <- 'Carbon'
df_hsd_energy_by_hsd$clave[df_hsd_energy_by_hsd$clave=='G013'] <- 'Firewood'
df_hsd_energy_by_hsd$clave[df_hsd_energy_by_hsd$clave=='G014'] <- 'Fuel_to_heat'
df_hsd_energy_by_hsd$clave[df_hsd_energy_by_hsd$clave=='G015'] <- 'Candles'
df_hsd_energy_by_hsd$clave[df_hsd_energy_by_hsd$clave=='G016'] <- 'Other'
df_hsd_energy_by_hsd$clave[df_hsd_energy_by_hsd$clave=='R001'] <- 'Electricity'
df_hsd_energy_by_hsd$clave[df_hsd_energy_by_hsd$clave=='R003'] <- 'NG'

#df_hsd_energy_by_hsd$clave <-as.factor(df_hsd_energy_by_hsd$clave)
df_hsd_by_hsd_wd <- reshape(df_hsd_energy_by_hsd, v.names = 'gasto_tri',
                            timevar='clave', idvar="id_household", sep = "_",
                            direction="wide")
#Change columns names 
colnames(df_hsd_by_hsd_wd)[which(names(df_hsd_by_hsd_wd) == 'gasto_tri_LPG')] <- 'LPG'
colnames(df_hsd_by_hsd_wd)[which(names(df_hsd_by_hsd_wd) == 'gasto_tri_Petroleum')] <- 'Petroleum'
colnames(df_hsd_by_hsd_wd)[which(names(df_hsd_by_hsd_wd) == 'gasto_tri_Diesel')] <- 'Diesel'
colnames(df_hsd_by_hsd_wd)[which(names(df_hsd_by_hsd_wd) == 'gasto_tri_Carbon')] <- 'Carbon'
colnames(df_hsd_by_hsd_wd)[which(names(df_hsd_by_hsd_wd) == 'gasto_tri_Firewood')] <- 'Firewood'
colnames(df_hsd_by_hsd_wd)[which(names(df_hsd_by_hsd_wd) == 'gasto_tri_Fuel_to_heat')] <- 'Fuel_to_heat'
colnames(df_hsd_by_hsd_wd)[which(names(df_hsd_by_hsd_wd) == 'gasto_tri_Candles')] <- 'Candles'
colnames(df_hsd_by_hsd_wd)[which(names(df_hsd_by_hsd_wd) == 'gasto_tri_Other')] <- 'Other'
colnames(df_hsd_by_hsd_wd)[which(names(df_hsd_by_hsd_wd) == 'gasto_tri_Electricity')] <- 'Electricity'
colnames(df_hsd_by_hsd_wd)[which(names(df_hsd_by_hsd_wd) == 'gasto_tri_NG')] <- 'NG'

df_hsd_by_hsd_wd

Store data of energy expenses by household

write.csv(df_hsd_by_hsd_wd, file = 'outputs/2018_enigh_ener-exp_hsd_wide.csv',
          fileEncoding="UTF-8", row.names=FALSE)

Descriptive Statistics

descr(df_hsd_by_hsd_wd, stats = c("mean", "sd", "min", "med", "max", "n.valid", 
                                  "pct.valid"), transpose = TRUE)

Box Plots

pal_plot <- c('grey','red', 'rgb(7,40,89)', 'green')
pal_plot <- setNames(pal_plot, c("elect", "ng", "candles", "lpg"))

#df_hsd_by_hsd_wd[is.na(df_hsd_by_hsd_wd)] = 0

pl_energy_exp_box <- plot_ly(type = 'box') %>%
  add_boxplot(y = df_hsd_by_hsd_wd$Electricity, 
              boxpoints = 'outliers', 
              name = "Electricity", 
              color = list(color =pal_plot['elect']),
              marker = list(color = pal_plot['elect']), 
              line=list(color = pal_plot['elect']) ) %>%
  add_boxplot(y = df_hsd_by_hsd_wd$NG, 
              boxpoints = 'outliers', 
              name = "Natural Gas", 
              color = list(color =pal_plot['ng']),
              marker = list(color = pal_plot['ng']), 
              line=list(color = pal_plot['ng']) ) %>%
  add_boxplot(y = df_hsd_by_hsd_wd$Candles, 
              boxpoints = 'outliers', 
              name = "Candles", 
              color = list(color =pal_plot['candles']),
              marker = list(color = pal_plot['candles']), 
              line=list(color = pal_plot['candles']) ) %>%
  add_boxplot(y = df_hsd_by_hsd_wd$LPG, 
              boxpoints = 'outliers', 
              name = "LPG", 
              color = list(color =pal_plot['lpg']),
              marker = list(color = pal_plot['lpg']), 
              line=list(color = pal_plot['lpg']) ) %>%
  layout(title = "Boxplots of Expenses by Household per Type of Energy", 
         yaxis = list(title = "$[MXN]", range = c(0, 5000)))

pl_energy_exp_box

plotly_IMAGE(pl_energy_exp_box, format = "png", out_file = "./figs/box_energy_expenses.png" )

Boxplots of energy expenses per household

Analysis of Expenses by Dwelling

Grouping data sets by dwelling id and expense id (clave)

df_hsd_energy_by_dwell <- df_hsd_energy %>%
  group_by(folioviv, clave) %>% replace(is.na(.), 0) %>% 
  summarise(gasto_tri = sum(gasto_tri))

df_psn_energy_by_dwell <- df_person_energy %>%
  group_by(folioviv, clave) %>% replace(is.na(.), 0) %>% 
  summarise(gasto_tri = sum(gasto_tri))

head(df_hsd_energy_by_dwell)
head(df_psn_energy_by_dwell)

Merging data sets by id_household using energy expenses related data

df_energy_by_dwell <- merge(df_hsd_energy_by_dwell, df_psn_energy_by_dwell, 
                          by="folioviv", all.y = TRUE, all.x = TRUE) %>% 
  replace(is.na(.), 0)

Reshaping data

df_hsd_energy_by_dwell <- as.data.frame(df_hsd_energy_by_dwell)

df_hsd_energy_by_dwell$clave[df_hsd_energy_by_dwell$clave=='G009'] <- 'LPG'
df_hsd_energy_by_dwell$clave[df_hsd_energy_by_dwell$clave=='G010'] <- 'Petroleum'
df_hsd_energy_by_dwell$clave[df_hsd_energy_by_dwell$clave=='G011'] <- 'Diesel'
df_hsd_energy_by_dwell$clave[df_hsd_energy_by_dwell$clave=='G012'] <- 'Carbon'
df_hsd_energy_by_dwell$clave[df_hsd_energy_by_dwell$clave=='G013'] <- 'Firewood'
df_hsd_energy_by_dwell$clave[df_hsd_energy_by_dwell$clave=='G014'] <- 'Fuel_to_heat'
df_hsd_energy_by_dwell$clave[df_hsd_energy_by_dwell$clave=='G015'] <- 'Candles'
df_hsd_energy_by_dwell$clave[df_hsd_energy_by_dwell$clave=='G016'] <- 'Other'
df_hsd_energy_by_dwell$clave[df_hsd_energy_by_dwell$clave=='R001'] <- 'Electricity'
df_hsd_energy_by_dwell$clave[df_hsd_energy_by_dwell$clave=='R003'] <- 'NG'

#df_hsd_energy_by_dwell$clave <-as.factor(df_hsd_energy_by_dwell$clave)
df_hsd_by_dwell_wd <- reshape(df_hsd_energy_by_dwell, v.names = 'gasto_tri',
                            timevar='clave', idvar="folioviv", sep = "_",
                            direction="wide")
#Change columns names 
colnames(df_hsd_by_dwell_wd)[which(names(df_hsd_by_dwell_wd) == 'gasto_tri_LPG')] <- 'LPG'
colnames(df_hsd_by_dwell_wd)[which(names(df_hsd_by_dwell_wd) == 'gasto_tri_Petroleum')] <- 'Petroleum'
colnames(df_hsd_by_dwell_wd)[which(names(df_hsd_by_dwell_wd) == 'gasto_tri_Diesel')] <- 'Diesel'
colnames(df_hsd_by_dwell_wd)[which(names(df_hsd_by_dwell_wd) == 'gasto_tri_Carbon')] <- 'Carbon'
colnames(df_hsd_by_dwell_wd)[which(names(df_hsd_by_dwell_wd) == 'gasto_tri_Firewood')] <- 'Firewood'
colnames(df_hsd_by_dwell_wd)[which(names(df_hsd_by_dwell_wd) == 'gasto_tri_Fuel_to_heat')] <- 'Fuel_to_heat'
colnames(df_hsd_by_dwell_wd)[which(names(df_hsd_by_dwell_wd) == 'gasto_tri_Candles')] <- 'Candles'
colnames(df_hsd_by_dwell_wd)[which(names(df_hsd_by_dwell_wd) == 'gasto_tri_Other')] <- 'Other'
colnames(df_hsd_by_dwell_wd)[which(names(df_hsd_by_dwell_wd) == 'gasto_tri_Electricity')] <- 'Electricity'
colnames(df_hsd_by_dwell_wd)[which(names(df_hsd_by_dwell_wd) == 'gasto_tri_NG')] <- 'NG'

df_hsd_by_dwell_wd

Adding weighting factor linked to the representation of dwellings

df_dwell_factors <- subset(df_dwell, select=c('folioviv', 'est_socio', 'tam_loc', 
                                              'est_dis', 'upm', 'factor'))

df_hsd_by_dwell_wd <- merge(df_hsd_by_dwell_wd, df_dwell_factors, 
                          by="folioviv", all.y = TRUE, all.x = TRUE)
head(df_hsd_by_dwell_wd)

Descriptive Statistics

descr(df_hsd_by_dwell_wd, stats = c("mean", "sd", "min", "med", "max", "n.valid", "pct.valid"),
      weights = df_hsd_by_dwell_wd$factor,
      transpose = TRUE)

Box Plots

pal_plot <- c('grey','red', 'rgb(7,40,89)', 'green')
pal_plot <- setNames(pal_plot, c("elect", "ng", "candles", "lpg"))

#df_hsd_by_dwell_wd[is.na(df_hsd_by_dwell_wd)] = 0

pl_energy_exp_box <- plot_ly(type = 'box') %>%
  add_boxplot(y = df_hsd_by_dwell_wd$Electricity, 
              boxpoints = 'outliers', 
              name = "Electricity", 
              color = list(color =pal_plot['elect']),
              marker = list(color = pal_plot['elect']), 
              line=list(color = pal_plot['elect']) ) %>%
  add_boxplot(y = df_hsd_by_dwell_wd$NG, 
              boxpoints = 'outliers', 
              name = "Natural Gas", 
              color = list(color =pal_plot['ng']),
              marker = list(color = pal_plot['ng']), 
              line=list(color = pal_plot['ng']) ) %>%
  add_boxplot(y = df_hsd_by_dwell_wd$Candles, 
              boxpoints = 'outliers', 
              name = "Candles", 
              color = list(color =pal_plot['candles']),
              marker = list(color = pal_plot['candles']), 
              line=list(color = pal_plot['candles']) ) %>%
  add_boxplot(y = df_hsd_by_dwell_wd$LPG, 
              boxpoints = 'outliers', 
              name = "LPG", 
              color = list(color =pal_plot['lpg']),
              marker = list(color = pal_plot['lpg']), 
              line=list(color = pal_plot['lpg']) ) %>%
  layout(title = "Boxplots of Expenses per  Dwelling by Type of Energy", 
         yaxis = list(title = "$[MXN]", range = c(0, 5000)))

pl_energy_exp_box

plotly_IMAGE(pl_energy_exp_box, format = "png", out_file = "./figs/box_energy_exp_byDwell.png" )

Boxplots of energy expenses per dwelling

Store data of energy expenses by dwelling

write.csv(df_hsd_by_dwell_wd, file = 'outputs/2018_enigh_ener-exp_dwell_wide.csv',
          fileEncoding="UTF-8", row.names=FALSE)

Survey Design

df_dwell$tam_loc<- factor(df_dwell$tam_loc,
                   levels = c(1, 2, 3, 4),
                   labels = c(">=100k", "15K-100k", "2.5K-15K", "<2.5K") )

df_dwell$est_socio<- factor(df_dwell$est_socio,
                   levels = c(1, 2, 3, 4),
                   labels = c("low", "med-low", "med-high", "high") )
df_hsd_by_dwell_wd$upm <- NULL
df_hsd_by_dwell_wd$factor <- NULL
df_hsd_by_dwell_wd$est_dis <- NULL
df_hsd_by_dwell_wd$tam_loc <- NULL
df_hsd_by_dwell_wd$est_socio <- NULL

df_dwell_sub <- subset(df_dwell, select=c('folioviv', 'est_socio', 'tam_loc', 
                                                    'upm', 'factor', 'est_dis'))
head(df_dwell_sub)

df_energy_by_folio <- merge(df_dwell_sub, df_hsd_by_dwell_wd, 
                          by="folioviv", all.y = TRUE, all.x = TRUE) %>% 
  replace(is.na(.), 0)

head(df_energy_by_folio)
#Using Survey Design Construction
svd_dwell <- svydesign(id=~upm, strata=~est_dis, 
                 data=df_energy_by_folio, weights=~factor)

#Information about the survey design

class(svd_dwell)

#View the number of unique PSUs (clusters) in this survey design, by referring 
# to the clu column from the original data.frame:
length(unique(df_energy_by_folio$upm))

#View the number of unique strata in this survey design, by referring to the 
# est_dis column from the original data.frame:
length(unique(df_energy_by_folio$est_dis))
tb_mean_energy  <- svyby(~Electricity, ~est_socio + tam_loc, svd_dwell, svymean, keep.var=TRUE)
tb_mean_energy

tb.mean.ing.exp.soc.mun <- svyby(~ing_cor + gasto_mon, ~est_socio + tam_loc, svd.household, svymean, keep.var=TRUE) #tb.mean.ing.exp.soc.mun

# Convert it to a table
tb_mean_energy %>% 
  ftable() %>%
  round(3)
              tam_loc      >=100k    15K-100k    2.5K-15K       <2.5K
                      Electricity Electricity Electricity Electricity

est_socio
low svymean 442.323 357.744 303.395 295.821 SE 8.260 12.308 16.104 13.201 med-low svymean 570.893 529.596 535.516 477.117 SE 10.855 59.339 135.498 78.909 med-high svymean 764.845 871.679 656.343 296.401 SE 37.739 56.484 150.844 442.323 high svymean 1149.308 609.769 620.566 52.316 SE 23.884 15.056 6.492 570.893

df_mean_energy_bytamloc <- tb_mean_energy %>% 
  group_by(tam_loc) 
df_mean_energy_bytamloc
colnames(df_mean_energy_bytamloc)[which(names(df_mean_energy_bytamloc) == 'est_socio')] <- 'Socioeconomic_Status'

colnames(df_mean_energy_bytamloc)[which(names(df_mean_energy_bytamloc) == 'tam_loc')] <- 'Municipality_Size'


#tb.mean.ing.exp.soc.mun$est_socio
p <- ggplot(df_mean_energy_bytamloc, aes(x=Municipality_Size, y=Electricity, 
                                    ymin = Electricity-se, ymax = Electricity+se,
                                    fill=Socioeconomic_Status)) + 
  geom_bar(stat = "identity", position=position_dodge()) + 
  geom_errorbar(position=position_dodge()) + 
  scale_fill_brewer(palette="Greens") +
  scale_y_continuous(name="Electricity Expenses [MXN]", labels=dollar) +
  ggtitle("Quarterly Expenses in Electricity per Dwelling in 2018")

p <- p + theme(plot.title = element_text(color="black", size=16, face="bold"))
p


#ggsave("mtcars.pdf", width = 4, height = 4)
ggsave("./figs/ElectricityExpenses.png", width = 8, height = 6, units = "in")